# coding:utf-8
'''
Scrape the novel "冥妻" from the shengxu6 novel site.
Index page: http://www.shengxu6.com/book/2967.html
'''
import urllib2
import re
from bs4 import BeautifulSoup

import sys
# Python 2-only workaround: reload(sys) makes sys.setdefaultencoding available
# again (it is removed from the module at interpreter startup), so that the
# implicit str/unicode conversions in this script use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf8')

def Main():
    '''
    Crawl the whole novel.

    Fetches the index page, extracts the chapter URLs from it, then downloads
    each chapter in turn and appends its text to ./result.txt.
    '''
    index_url = "http://www.shengxu6.com/book/2967.html"
    chapter_urls = ParseMainPage(OpenPage(index_url))
    for chapter_url in chapter_urls:
        print("crawler url=" + chapter_url)
        chapter_text = ParseDetailPage(OpenPage(chapter_url))
        WriteDataToFile("./result.txt", chapter_text)
    print("crawler done!")

def OpenPage(url):
    '''
    Download url and return the response body as a UTF-8 encoded byte string.

    The raw body is decoded as GBK (bytes that cannot be decoded are dropped)
    and then re-encoded as UTF-8. Errors raised by urllib2 while opening or
    reading the URL propagate to the caller.
    '''
    # No custom request headers are sent.
    headers = {}
    req = urllib2.Request(url, headers=headers)
    f = urllib2.urlopen(req)
    try:
        data = f.read()
    finally:
        # urllib2 responses are not context managers on Python 2, so close the
        # connection explicitly, even when read() fails.
        f.close()
    return data.decode('GBK', errors="ignore").encode('UTF-8')

def ParseMainPage(page):
    '''
    Extract the chapter URLs from the HTML of the novel's index page.

    Every element whose href attribute matches the pattern "read" is treated
    as a chapter link; its href is appended to the site root.
    Returns the resulting URLs as a list, in the order find_all yields them.
    '''
    site_root = 'http://www.shengxu6.com'
    chapter_pattern = re.compile("read")
    document = BeautifulSoup(page, 'html.parser')
    chapter_urls = []
    for link in document.find_all(href=chapter_pattern):
        chapter_urls.append(site_root + link['href'])
    return chapter_urls

def ParseDetailPage(page):
    '''
    Return the text of the first element with class "content-body" found in
    the given chapter page HTML.

    Raises IndexError when the page contains no such element.
    '''
    document = BeautifulSoup(page, 'html.parser')
    body_nodes = document.find_all(class_="content-body")
    first_body = body_nodes[0]
    return first_body.get_text()

def WriteDataToFile(file_path, data):
    '''
    Append data to the file at file_path, creating the file if it is missing.

    file_path: path of the output file.
    data: text to append.
    '''
    # The with-block guarantees the handle is closed even if write() raises.
    with open(file_path, 'a+') as f:
        f.write(data)

def Test1():
    '''
    Manual check: download the index page and print the chapter URLs parsed
    from it.
    '''
    index_url = "http://www.shengxu6.com/book/2967.html"
    chapter_urls = ParseMainPage(OpenPage(index_url))
    print(chapter_urls)

def Test2():
    '''
    Manual check: download one chapter page and print the text extracted
    from it.
    '''
    # Another chapter page that can be used instead:
    # http://www.shengxu6.com/read/2967_2008175.html
    chapter_url = "http://www.shengxu6.com/read/2967_2008289.html"
    chapter_text = ParseDetailPage(OpenPage(chapter_url))
    print(chapter_text)

def Test():
    '''
    Run the manual checks. Only Test2 is enabled; the Test1 call is kept
    commented out so it can be switched back on by hand.
    '''
    # Test1()
    Test2()

# Run the crawl only when this file is executed as a script, so importing the
# module does not start a network crawl as a side effect.
# Swap Main() for Test() to run the manual checks instead.
if __name__ == '__main__':
    Main()
